
#####
# Code for analysis of coal plant retirements 2002 - 2022
# Sam Trachtman and Jacob Alder
#####

rm(list=ls())
setwd("C:/Users/samtr/Dropbox/coal-retirements-analysis/Replication")

#load libraries
library(rio)
library(tidyverse)
library(tidyr)
library(survival)
library(psych)
library(survminer)
library(lmtest)
library(sandwich)
library(stargazer)
library(tidycensus)
library(dplyr)
library(readxl)


##### STATE AND COUNTY COVARIATES LOADING AND CLEANING ##### 

#load census data 
census_api_key("e5db78393ea3088c68f85215f4b70915d454469c")
variables <- c(
  median_income = "B19013_001",      # Median Household Income
  population = "B01003_001", 
  black_population = "B02001_003",    # Black or African American Alone
  hispanic_population = "B03001_003"  # Hispanic or Latino
)

acs_data <- get_acs(
  geography = "county",
  variables = variables,
  year = 2009,    # Year for the 5-year estimates
  survey = "acs5"
)

#import RUCC code for rural-urban covariate
rucc <- read_excel("ruralurbancodes2003.xls", sheet = 1) %>%
  select("FIPS Code", "2003 Rural-urban Continuum Code")
names(rucc) <- c("fips","rural_urban_code")

#merge and clean
pop_data <- acs_data %>%
  filter(variable=="population") %>%
  select(GEOID, estimate)
income_data <- acs_data %>%
  filter(variable=="median_income") %>%
  select(GEOID, estimate)
black_data <- acs_data %>%
  filter(variable=="black_population") %>%
  select(GEOID, estimate)
hispanic_data <- acs_data %>%
  filter(variable=="hispanic_population") %>%
  select(GEOID, estimate)
acs <- left_join(pop_data, income_data, by = "GEOID")
acs <- left_join(acs, black_data, by="GEOID")
acs <- left_join(acs, hispanic_data, by="GEOID")
names(acs) <- c("fips", "population", "median_income", "black_pop", "hispanic_pop")
acs <- left_join(acs, rucc, by="fips")
rm(black_data, hispanic_data, rucc, pop_data, income_data)

# load state-level covariates
state_covs <- import("state_level_covs.xlsx") %>%
  dplyr::select(-state) %>%
  rename("state"="abb")


##### LOAD AND CLEAN GENERATOR PANEL #####

#load panel data 2002 - 2022 
load("2024-04-11-fout.pop.covs.RData")
raw <- fout.pop.covs
rm(fout.pop.covs)

#calculating key outcome variables
raw$counter <- 1
raw$first_year <- raw$report_date=="2002-01-01"
raw$final_year <- raw$report_date=="2022-01-01"

df <- raw %>%
  mutate(
    percent_urban = tot_pop_urban/tot_pop,
    percent_black = tot_race_black/tot_pop, #note race doesn't seem to add up
    plant_age = coal_operating_weighted_mw_avg_age,
    coal_operating_mw = ifelse(is.na(coal_operating_mw), 0, coal_operating_mw), #years after retirements set as NA's, now 0's 
    gas_operating_mw = ifelse(is.na(gas_operating_mw), 0, gas_operating_mw), #years after retirements set as NA's, now 0's 
    total_operating_coal_gens = ifelse(is.na(total_operating_coal_gens), 0, total_operating_coal_gens), #years after retirements set as NA's, now 0's
    total_operating_gas_gens = ifelse(is.na(total_operating_gas_gens), 0, total_operating_gas_gens) #years after retirements set as NA's, now 0's
  ) %>%
  group_by(plant_id_eia) %>%
  mutate(
    n = sum(counter),
    coal_start = coal_operating_mw[first_year==1],
    gas_start = gas_operating_mw[first_year==1],
    sector_start = sector_id_eia[first_year==1],
    owner_start = owner[first_year==1]
  ) 
rm(raw)

#calculate percent of original coal capacity in each year 
df$perc_coal = df$coal_operating_mw/df$coal_start 
hist(df$perc_coal)

#percent of original coal capacity, max of 1 
df$perc_coal2 = ifelse(df$perc_coal<1, df$perc_coal, 1)

#examining outliers (where grew coal operating capacity significantly)
test <- subset(df, perc_coal > 2)
test2 <- test %>%
  group_by(plant_id_eia) %>%
  summarize(sum(counter))
#only 19 plants that doubled or more in capacity over period 
rm(test, test2)

#iso/rto variable pulling from middle of sequence
df$iso_code2 <- ifelse(is.na(df$iso_rto_code), 0, 1) #plants that were in iso/rto has label in middle of sequence 

#competitive market by year code
comp_market <- read_excel("2024-09-12-competitive_market_information_plant_panel.xlsx") %>%
  dplyr::select(year, utility_id_eia, plant_id_eia, competitive_market)
df <- left_join(df, comp_market, by=c("utility_id_eia", "plant_id_eia", "year"))

#sample restrictions
df <- df %>%
  filter(sector_start %in% c(1,2)) #filtering out industrial, commercial, IPP cogeneration 

#ownership vbl based on first year of data
df$owner_start = ifelse(is.na(df$owner_start), "IPP", df$owner_start)
df$ownership <- with(df,
                     ifelse(owner_start=="iou","IOU",
                            ifelse(owner_start=="coop","Co-op",
                                   ifelse(owner_start=="IPP","IPP", "POU"))))
table(df$ownership)

#cases of change in ownership 
df$owner_start2 <- ifelse(df$owner_start %in% c("fed","muni","state"), "iou",
                          ifelse(df$owner_start == "iou", "iou",
                                 ifelse(df$owner_start=="IPP", "ipp", "coop")))
df$owner2 <- ifelse(df$owner %in% c("fed","muni","state"), "iou",
                    ifelse(df$owner == "iou", "iou",
                           ifelse(df$owner=="ipp", "ipp", "coop")))
test <- df %>%
  group_by(plant_id_eia) %>%
  mutate(var_changed = ifelse(owner2 != lag(owner2, default = first(owner2)), 1, 0)) %>%
  ungroup()
df_summary <- test %>%
  group_by(plant_id_eia) %>%
  summarize(var_changed_over_period = any(var_changed == 1)) %>%
  ungroup()
sum(df_summary$var_changed_over_period, na.rm=T)
#23 cases in total 


##### SUMMARY STATISTICS AND CHARTS BY YEAR #####
#summarize by year 
sum.yr <- df %>%
  group_by(year,ownership) %>%
  summarize(coal_capacity = sum(coal_operating_mw),
            coal_gens = sum(total_operating_coal_gens),
            gas_gens = sum(total_operating_gas_gens),
            gas_capacity = sum(gas_operating_mw),
            coal_perc = sum(coal_operating_mw)/sum(coal_start),
            gas_perc = sum(gas_operating_mw)/sum(gas_start))

sum.yr.total <- df %>%
  group_by(year) %>%
  summarize(coal_capacity = sum(coal_operating_mw),
            coal_gens = sum(total_operating_coal_gens),
            gas_gens = sum(total_operating_gas_gens),
            gas_capacity = sum(gas_operating_mw),
            coal_perc = sum(coal_operating_mw)/sum(coal_start),
            gas_perc = sum(gas_operating_mw)/sum(gas_start))

#plotting capacity and # generators over time by ownership
ggplot(data = sum.yr.total, aes(x = year, y = coal_capacity/1000)) +
  geom_col(fill = "#3498db", color = "#2c3e50", width = 0.7) + # Custom colors
  labs(#title = "Figure 1: U.S. operational coal capacity, 2002-2022",
    #subtitle = "Total coal capacity over the years",
    x = "Year",
    y = "Coal capacity (in GW)",
    caption = "Source: EIA 860, accessed through Public Utility Data Liberation (PUDL)"  # Titles and labels
  ) + 
  theme_minimal(base_size = 14) + # Minimal theme with base font size
  theme(
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(face = "italic", size = 12, hjust = 0.5),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    plot.caption = element_text(size = 10, face = "italic", hjust = 0),
    panel.grid.major = element_line(color = "grey85"),
    panel.grid.minor = element_blank()
  ) + # Text elements and gridlines
  scale_x_continuous(breaks = seq(min(sum.yr$year), max(sum.yr$year), by = 1)) + # Custom x-axis breaks
  scale_y_continuous(labels = scales::comma) # Formatting y-axis labels



ggplot(data = sum.yr, aes(x = year, y = coal_perc, fill = ownership)) +
  geom_col(position = "dodge", color = "#2c3e50", width = 0.7) + # Custom colors and bar width
  facet_wrap(~ ownership) + # Facet by ownership
  labs(#title = "Figure 2: U.S. operational coal capacity for electricity generation by ownership type",
    #subtitle = "Percent of 2002 total",
    x = "Year",
    y = "Operational coal capacity, % of 2002 total",
    fill = "Ownership",
    caption = "Source: EIA 860, accessed through Public Utility Data Liberation (PUDL)") + # Titles and labels
  theme_minimal(base_size = 14) + # Minimal theme with base font size
  theme(
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(face = "italic", size = 12, hjust = 0.5),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    strip.text = element_text(face = "bold", size = 12), # Facet labels
    plot.caption = element_text(size = 10, face = "italic", hjust = 0),
    panel.grid.major = element_line(color = "grey85"),
    panel.grid.minor = element_blank(),
    legend.position = "bottom", # Position of the legend
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  ) + # Text elements and gridlines
  scale_x_continuous(breaks = seq(min(sum.yr$year), max(sum.yr$year), by = 1)) + # Custom x-axis breaks
  scale_y_continuous(labels = scales::percent_format()) + # Formatting y-axis labels as percentages
  scale_fill_brewer(palette = "Set2") # Custom color palette


##### COLLAPSE TO PLANT LEVEL, CLEAN, CALCULATE #####
#collapse to plant level, calculating main outcome as sum of percent coal over the sequence 
collapsed <- df %>%
  group_by(plant_id_eia) %>%
  summarize(
    start_plant_mw = total_plant_existing_mw[first_year==1], #this includes coal and other sources of power at the plant 
    coal_start = mean(coal_start, na.rm=T),
    comp_market_old = sum(iso_code2) > 0,
    comp_market2 = mean(competitive_market,na.rm=T),
    coal_end = coal_operating_mw[final_year==1],
    coal_ch = coal_operating_mw[final_year==1] - coal_operating_mw[first_year==1],
    gas_ch = gas_operating_mw[final_year==1] - gas_operating_mw[first_year==1],
    coal_ch_perc = (coal_operating_mw[final_year==1] - coal_operating_mw[first_year==1])/coal_operating_mw[first_year==1],
    perc_gens_retired = (total_operating_coal_gens[first_year==1] - total_operating_coal_gens[final_year==1])/total_operating_coal_gens[first_year==1], #this seems like funky outcome! 
    outcome = sum(perc_coal),
    outcome2 = sum(perc_coal2)
  )

#merge collapsed with variables from first year of sequence
df_firstyr <- subset(df, first_year==1) %>%
  select(plant_id_eia, utility_id_eia, state, county, owner_name, owner_utility_id_eia, ownership, owner_type, 
         public, total_plant_mw, tot_pop, entity_type, plant_age, sector_name_eia, sector_id_eia, fips)
df.plant <- left_join(collapsed, df_firstyr, by=c("plant_id_eia"))
rm(collapsed, df_firstyr)


#code ownership dummy vbl and total owner size
df.plant <- df.plant %>% 
  mutate(
    coop = as.numeric(ownership=="Co-op"), pub = as.numeric(ownership=="POU"), 
    ipp = as.numeric(ownership=="IPP"), iou = as.numeric(ownership=="IOU") 
  ) %>%
  group_by(utility_id_eia) %>% 
  mutate(
    tot_coal_assets = sum(coal_start)
  )

#merging state-level vbls 
df.plant <- left_join(df.plant, state_covs, by = "state")
df.plant <- left_join(df.plant, acs, by = "fips")

#add missing age vbls
age_missings <- import("missing_age_vbls.xlsx")
df.plant <- left_join(df.plant, age_missings, by="plant_id_eia")
df.plant$plant_age <- with(df.plant,
                           ifelse(is.na(plant_age), plant_age2, plant_age))


#descriptives
df.plant$counter <- 1
sum2 <- df.plant %>%
  group_by(ownership) %>%
  summarize(
    tot_start = sum(coal_start), 
    tot_end = sum(coal_end),
    perc_ch = (sum(coal_end) - sum(coal_start))/sum(coal_start),
    perc_ch_avg = mean(coal_ch_perc, na.rm=T),
    avg_age = mean(plant_age,na.rm=T),
    avg_size = mean(coal_start,na.rm=T),
    avg_outcome = mean(outcome,na.rm=T),
    avg_outcome2 = mean(outcome2,na.rm=T),
    avg_gas = mean(avg_gas, na.rm=T),
    avg_coal = mean(avg_coal, na.rm=T),
    n = sum(counter)
  )
#write.csv(sum2, "sum2.csv")


##### MAIN MODELS ##### 

#rescale
df.plant$start_plant_mw_adj <- df.plant$start_plant_mw/1000
df.plant$tot_coal_assets_adj <- df.plant$tot_coal_assets/1000 
df.plant$population_adj <- df.plant$population/1000000
df.plant$median_income_adj <- df.plant$median_income/1000
df.plant$black_pop_perc <- df.plant$black_pop/df.plant$population
df.plant$hispanic_pop_perc <- df.plant$hispanic_pop/df.plant$population

#code metro, adjacent, non-adjacent 
df.plant$metro <- ifelse(df.plant$rural_urban_code<4, 1, 0)
df.plant$nonmetro_adjacent <- ifelse(df.plant$rural_urban_code %in% c(8, 4, 6), 1, 0)
df.plant$nonmetro_nonadjacent <- ifelse(df.plant$rural_urban_code %in% c(5, 7, 9), 1, 0)

#covariate table
covariate.table <- df.plant %>%
  select(start_plant_mw_adj, plant_age) 
mean(df.plant$start_plant_mw_adj, na.rm=T)
mean(df.plant$plant_age, na.rm=T)
mean(df.plant$tot_coal_assets_adj,na.rm=T)
mean(df.plant$comp_market,na.rm=T)
mean(df.plant$years_dem_trifecta,na.rm=T)
mean(df.plant$years_rep_trifecta,na.rm=T)
mean(df.plant$tot_coal_assets_adj,na.rm=T)
mean(df.plant$tot_coal_assets_adj,na.rm=T)
mean(log(df.plant$gdp_per_cap+1),na.rm=T)
mean(log(df.plant$avg_coal+1),na.rm=T)
mean(log(df.plant$avg_gas+1),na.rm=T)
mean(df.plant$population_adj, na.rm=T)
mean(df.plant$median_income_adj, na.rm=T)
mean(df.plant$black_pop_perc, na.rm=T)
mean(df.plant$hispanic_pop_perc, na.rm=T)

####OPERATIONAL YEAR EQUIVALENTS
mod1 <- lm(outcome2 ~ pub + coop + ipp  
           , data = df.plant)
summary(mod1)
coeftest(mod1, vcov=vcovCL(mod1, cluster.var = utility_id_eia))

mod2 <- lm(outcome2 ~ pub + coop + ipp  
           + start_plant_mw_adj + plant_age + tot_coal_assets_adj + comp_market2 
           , data = df.plant)
summary(mod2)
coeftest(mod2, vcov=vcovCL(mod2, cluster.var = utility_id_eia))

mod3 <- lm(outcome2 ~ pub + coop + ipp  
           + start_plant_mw_adj + plant_age + tot_coal_assets_adj + comp_market2 
           + years_dem_trifecta + years_rep_trifecta + log(gdp_per_cap+1) + log(avg_coal+1) + log(avg_gas + 1) 
           + population_adj + median_income_adj + black_pop_perc + hispanic_pop_perc + nonmetro_adjacent + nonmetro_nonadjacent
           , data = df.plant)
summary(mod3)
coeftest(mod3, vcov=vcovCL(mod3, cluster.var = utility_id_eia))

